The species I will be using for this lab is the American Flamingo, Phoenicopterus ruber. There is an image of an American Flamingo taken by my dad.
obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
# get species occurrence data from GBIF with coordinates
res <- spocc::occ(
query = 'Phoenicopterus ruber',
from = 'gbif',
has_coords = T,
limit = 10000
)
df <- res$gbif$data[[1]]
clean_df <- df %>%
filter(latitude < 40,
longitude < 100) %>%
select(c("name", "longitude", "latitude", "prov", "key", "lifeStage",
"continent", "stateProvince", "year", "species"))
nrow(clean_df) # number of rows
## [1] 9995
readr::write_csv(clean_df, obs_csv)
obs <- clean_df %>%
sf::st_as_sf(
coords = c("longitude", "latitude"),
crs = st_crs(4326)) %>%
select(prov, key) # save space (joinable from obs_csv)
sf::write_sf(obs, obs_geo, delete_dsn=T)
obs <- sf::read_sf(obs_geo)
nrow(obs)
## [1] 9995
Mapped distribution of the points are shown below
# show points on map
mapview::mapview(obs, map.types = "OpenTopoMap")
There are 85,299 observations of American Flamingoes in the GBIF database. After cleaning the 10,000 observations to remove 4 in zoos in Northern Europe, my map displayed 9,996 observations of flamingoes. I considered removing duplicates but flamingoes usually travel in large flocks so it is to be expected that there would be multiple at very similar coordinates.
dir_env <- file.path(dir_data, "env")
# set a default data directory
options(sdmpredictors_datadir = dir_env)
# choosing marine
env_datasets <- sdmpredictors::list_datasets(terrestrial = T, marine = F)
# show table of datasets
env_datasets %>%
select(dataset_code, description, citation) %>%
DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")
# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)
When deciding on the environmental layers to use, I looked into what factors affected the species distribution of flamingos. The most common factors are temperature, depth of water and access to food sources. Because of this, I chose the annual mean temperature, mean daily temperature, temperature seasonality, annual precipitation, terrain and topographic wetness layers.
# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_bio1", "WC_bio2", "WC_bio4", "WC_bio12", "ER_tri", "ER_topoWet")
# get layers
env_stack <- load_layers(env_layers_vec)
# interactive plot layers, hiding all but first (select others)
plot(env_stack, nc=2)
# obs_hull_geo <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
# make convex hull around points of observation
obs_hull <- sf::st_convex_hull(st_union(obs))
# save obs hull
write_sf(obs_hull, "data/sdm/obs_hull.geojson")
obs_hull <- read_sf("data/sdm/obs_hull.geojson")
# show points on map
mapview(list(obs, obs_hull))
obs_hull_sp <- sf::as_Spatial(obs_hull)
env_stack <- raster::mask(env_stack, obs_hull_sp) %>%
raster::crop(extent(obs_hull_sp))
writeRaster(env_stack, env_stack_grd, overwrite=T)
env_stack <- stack(env_stack_grd)
# show map
# mapview(obs) +
# mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)
absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
# get raster count of observations
r_obs <- rasterize(
sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)
absence <- dismo::randomPoints(r_mask, nrow(obs)) %>%
as_tibble() %>%
st_as_sf(coords = c("x", "y"), crs = 4326)
write_sf(absence, absence_geo, delete_dsn=T)
absence <- read_sf(absence_geo)
# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") +
mapview(absence, col.regions = "gray")
# combine presence and absence into single set of labeled points
pts <- rbind(
obs %>%
mutate(
present = 1) %>%
select(present, key),
absence %>%
mutate(
present = 0,
key = NA)) %>%
mutate(
ID = 1:n()) %>%
relocate(ID)
write_sf(pts, pts_geo, delete_dsn=T)
# extract raster values for points
pts_env <- raster::extract(env_stack, as_Spatial(pts), df=TRUE) %>%
tibble() %>%
# join present and geometry columns to raster value results for points
left_join(
pts %>%
select(ID, present),
by = "ID") %>%
relocate(present, .after = ID) %>%
# extract lon, lat as single columns
mutate(
#present = factor(present),
lon = st_coordinates(geometry)[,1],
lat = st_coordinates(geometry)[,2]) %>%
select(-geometry)
write_csv(pts_env, pts_env_csv)
pts_env <- read_csv(pts_env_csv)
pts_env %>%
# show first 10 presence, last 10 absence
slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>%
DT::datatable(
rownames = F,
options = list(
dom = "t",
pageLength = 20))
pts_env %>%
select(-ID) %>%
mutate(
present = factor(present)) %>%
pivot_longer(-present) %>%
ggplot() +
geom_density(aes(x = value, fill = present)) +
scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0)) +
theme_bw() +
facet_wrap(~name, scales = "free") +
theme(
legend.position = c(1, 0),
legend.justification = c(1, 0))
GGally::ggpairs(
select(pts_env, -ID),
aes(color = factor(present)))
# setup model data
d <- pts_env %>%
select(-ID) %>% # remove terms we don't want to model
tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 19666
# fit a linear model
mdl <- lm(present ~ ., data = d)
summary(mdl)
##
## Call:
## lm(formula = present ~ ., data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3436 -0.2330 -0.0042 0.2660 1.4528
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.337427114 0.047220821 49.500 < 0.0000000000000002 ***
## WC_bio1 -0.051314935 0.000998385 -51.398 < 0.0000000000000002 ***
## WC_bio2 -0.079296938 0.001190722 -66.596 < 0.0000000000000002 ***
## WC_bio4 0.000855992 0.000239617 3.572 0.000355 ***
## WC_bio12 -0.000248862 0.000004093 -60.800 < 0.0000000000000002 ***
## ER_tri -0.004050119 0.000206591 -19.605 < 0.0000000000000002 ***
## ER_topoWet 0.033489066 0.003434389 9.751 < 0.0000000000000002 ***
## lon -0.003750310 0.000088678 -42.291 < 0.0000000000000002 ***
## lat -0.004713234 0.000241597 -19.509 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.348 on 19657 degrees of freedom
## Multiple R-squared: 0.5156, Adjusted R-squared: 0.5154
## F-statistic: 2615 on 8 and 19657 DF, p-value: < 0.00000000000000022
y_predict <- predict(mdl, pts_env, type="response")
y_true <- pts_env$present
range(y_predict)
## [1] NA NA
range(y_true)
## [1] 0 1
# fit a generalized linear model with a binomial logit link function
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl)
##
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"),
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.7889 -0.4619 -0.0356 0.5161 4.1303
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 16.56154144 0.49657377 33.352 < 0.0000000000000002 ***
## WC_bio1 -0.44578866 0.01157672 -38.507 < 0.0000000000000002 ***
## WC_bio2 -0.63734106 0.01291311 -49.356 < 0.0000000000000002 ***
## WC_bio4 -0.00543979 0.00231069 -2.354 0.0186 *
## WC_bio12 -0.00196284 0.00004287 -45.783 < 0.0000000000000002 ***
## ER_tri -0.05231856 0.00260975 -20.047 < 0.0000000000000002 ***
## ER_topoWet 0.23003147 0.03362956 6.840 0.00000000000791 ***
## lon -0.02801416 0.00086082 -32.544 < 0.0000000000000002 ***
## lat -0.04085363 0.00219643 -18.600 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27259 on 19665 degrees of freedom
## Residual deviance: 13558 on 19657 degrees of freedom
## AIC: 13576
##
## Number of Fisher Scoring iterations: 6
y_predict <- predict(mdl, d, type="response")
range(y_predict)
## [1] 0.0000001391132 0.9996481631125
# show term plots
termplot(mdl, partial.resid = TRUE, se = TRUE, main = F)
# fit a generalized additive model with smooth predictors
mdl <- mgcv::gam(
formula = present ~ s(WC_bio1) + s(WC_bio2) + s(WC_bio4) + s(WC_bio12) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat),
family = binomial, data = d)
summary(mdl)
##
## Family: binomial
## Link function: logit
##
## Formula:
## present ~ s(WC_bio1) + s(WC_bio2) + s(WC_bio4) + s(WC_bio12) +
## s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -76.20 8.36 -9.114 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(WC_bio1) 7.539 7.962 217.34 < 0.0000000000000002 ***
## s(WC_bio2) 8.821 8.981 439.53 < 0.0000000000000002 ***
## s(WC_bio4) 8.988 9.000 265.35 < 0.0000000000000002 ***
## s(WC_bio12) 8.028 8.113 722.01 < 0.0000000000000002 ***
## s(ER_tri) 6.818 7.530 34.26 0.0000184 ***
## s(ER_topoWet) 7.815 8.528 54.90 < 0.0000000000000002 ***
## s(lon) 8.973 8.999 373.82 < 0.0000000000000002 ***
## s(lat) 8.815 8.981 583.92 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.874 Deviance explained = 83.6%
## UBRE = -0.76618 Scale est. = 1 n = 19666
# show term plots
plot(mdl, scale=0)
Which GAM environmental variables, and even range of values, seem to contribute most towards presence (above 0 response) versus absence (below 0 response)?
WC_bio2 (mean daily temperature range) and lat (latitude) are the two GAM environmental variables that seem to contribute most towards presence. They both have periods where they are below zero, but when comparing them to the other variables they contribute more towards presence.
Maxent is probably the most commonly used species distribution model since it performs well with few input data points, only requires presence points and is easy to use with a Java graphical user interface (GUI).
# show version of maxent
maxent()
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)
# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>%
sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class
# fit a maximum entropy model
mdl <- maxent(env_stack, obs_sp)
## This is MaxEnt version 3.4.3
plot(mdl)
# plot term plots
response(mdl)
Which Maxent environmental variables, and even range of values, seem to contribute most towards presence (closer to 1 response) and how might this differ from the GAM results?
For the Maxent variables, WC_bio1 (annual mean temperature) and WC_bio2 (mean daily temperature range) seem to contribute most to presence. This is the same in the case of WC_bio2 but WC_bio1 differs from the GAM where it contributes more to absences.
# predict
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')
plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')
# read data
pts_env <- read_csv(pts_env_csv)
d <- pts_env %>%
select(-ID) %>% # not used as a predictor x
mutate(
present = factor(present)) %>% # categorical response
na.omit() # drop rows with NA
skim(d)
| Name | d |
| Number of rows | 19666 |
| Number of columns | 9 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| present | 0 | 1 | FALSE | 2 | 0: 9975, 1: 9691 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| WC_bio1 | 0 | 1 | 22.62 | 4.35 | 4.90 | 18.40 | 24.20 | 26.30 | 30.60 | ▁▁▇▆▇ |
| WC_bio2 | 0 | 1 | 11.53 | 3.17 | 4.20 | 9.30 | 11.20 | 14.10 | 19.70 | ▂▇▇▆▁ |
| WC_bio4 | 0 | 1 | 26.18 | 19.47 | 1.44 | 8.82 | 20.96 | 37.50 | 87.85 | ▇▆▂▂▁ |
| WC_bio12 | 0 | 1 | 961.85 | 722.72 | 4.00 | 480.00 | 719.00 | 1383.00 | 7428.00 | ▇▂▁▁▁ |
| ER_tri | 0 | 1 | 14.50 | 21.63 | 0.00 | 2.88 | 7.86 | 16.07 | 269.51 | ▇▁▁▁▁ |
| ER_topoWet | 0 | 1 | 12.21 | 1.37 | 6.88 | 11.36 | 12.34 | 13.33 | 15.72 | ▁▂▆▇▁ |
| lon | 0 | 1 | -26.43 | 48.88 | -117.14 | -73.55 | -8.46 | 19.46 | 40.04 | ▃▅▁▂▇ |
| lat | 0 | 1 | 0.42 | 21.27 | -34.71 | -21.12 | 4.04 | 20.29 | 39.21 | ▆▃▅▇▃ |
# create training set with 80% of full data
d_split <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train <- rsample::training(d_split)
# show number of rows present is 0 vs 1
table(d$present)
##
## 0 1
## 9975 9691
table(d_train$present)
##
## 0 1
## 7980 7752
# run decision stump model
mdl <- rpart(
present ~ ., data = d_train,
control = list(
cp = 0, minbucket = 5, maxdepth = 1))
mdl
## n= 15732
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 15732 7752 0 (0.50724638 0.49275362)
## 2) lat>=-25.49979 12253 4597 0 (0.62482657 0.37517343) *
## 3) lat< -25.49979 3479 324 1 (0.09313021 0.90686979) *
# plot tree
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl)
# decision tree with defaults
mdl <- rpart(present ~ ., data = d_train)
mdl
## n= 15732
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 15732 7752 0 (0.50724638 0.49275362)
## 2) lat>=-25.49979 12253 4597 0 (0.62482657 0.37517343)
## 4) lon>=-68.16808 6663 610 0 (0.90844965 0.09155035)
## 8) WC_bio1>=17.55 6404 434 0 (0.93222986 0.06777014)
## 16) lon>=-61.40179 5776 228 0 (0.96052632 0.03947368) *
## 17) lon< -61.40179 628 206 0 (0.67197452 0.32802548)
## 34) lat< 10.12695 411 1 0 (0.99756691 0.00243309) *
## 35) lat>=10.12695 217 12 1 (0.05529954 0.94470046) *
## 9) WC_bio1< 17.55 259 83 1 (0.32046332 0.67953668) *
## 5) lon< -68.16808 5590 1603 1 (0.28676208 0.71323792)
## 10) WC_bio12>=1504 673 28 0 (0.95839525 0.04160475) *
## 11) WC_bio12< 1504 4917 958 1 (0.19483425 0.80516575)
## 22) lon< -91.58806 649 46 0 (0.92912173 0.07087827) *
## 23) lon>=-91.58806 4268 355 1 (0.08317713 0.91682287)
## 46) WC_bio1< 19.75 151 8 0 (0.94701987 0.05298013) *
## 47) WC_bio1>=19.75 4117 212 1 (0.05149381 0.94850619) *
## 3) lat< -25.49979 3479 324 1 (0.09313021 0.90686979) *
rpart.plot(mdl)
# plot complexity parameter
plotcp(mdl)
# rpart cross validation results
mdl$cptable
## CP nsplit rel error xerror xstd
## 1 0.36519608 0 1.0000000 1.0000000 0.008089145
## 2 0.30753354 1 0.6348039 0.6351909 0.007502845
## 3 0.07959236 2 0.3272704 0.3276574 0.005953419
## 4 0.07185243 3 0.2476780 0.2521930 0.005337587
## 5 0.01741486 4 0.1758256 0.1769866 0.004565082
## 6 0.01229790 5 0.1584107 0.1597007 0.004356610
## 7 0.01000000 8 0.1215170 0.1231940 0.003863573
Based on the complexity plot threshold, what size of tree is recommended? Based on the threshold, the tree is recommended to have a size of 10.
# caret cross validation results
mdl_caret <- train(
present ~ .,
data = d_train,
method = "rpart",
trControl = trainControl(method = "cv", number = 10),
tuneLength = 20)
ggplot(mdl_caret)
vip(mdl_caret, num_features = 40, bar = FALSE)
What are the top 3 most important variables of your model?
Based on the variable importance plot, the top three most important variables are latitude, WC_bio2 (daily temperature range), and longitude.
# commented these out because they were causing my code to crash.
# Construct partial dependence plots
# p1 <- partial(mdl_caret, pred.var = "lat") %>% autoplot()
# p2 <- partial(mdl_caret, pred.var = "WC_bio2") %>% autoplot()
# p3 <- partial(mdl_caret, pred.var = c("lat", "WC_bio2")) %>%
# plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE,
# colorkey = TRUE, screen = list(z = -20, x = -60))
#
# # Display plots side by side
# gridExtra::grid.arrange(p1, p2, p3, ncol = 3)
# number of features
n_features <- length(setdiff(names(d_train), "present"))
# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)
# get out of the box RMSE
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.1149846
# re-run model with impurity-based variable importance
mdl_impurity <- ranger(
present ~ ., data = d_train,
importance = "impurity")
# re-run model with permutation-based variable importance
mdl_permutation <- ranger(
present ~ ., data = d_train,
importance = "permutation")
p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)
gridExtra::grid.arrange(p1, p2, nrow = 1)
How might variable importance differ between rpart and RandomForest in your model outputs? Variable importance for rpart and impurity based RandomForest modeling remained consistent, but for permutation based random forest, WC_bio2 became more important than WC_bio12.
# create training set with 80% of full data
pts_split <- rsample::initial_split(
pts, prop = 0.8, strata = "present")
pts_train <- rsample::training(pts_split)
pts_test <- rsample::testing(pts_split)
pts_train_p <- pts_train %>%
filter(present == 1) %>%
as_Spatial()
pts_train_a <- pts_train %>%
filter(present == 0) %>%
as_Spatial()
# show pairs plot before multicollinearity reduction with vifcor()
pairs(env_stack)
# each point in the scatter plot on the lower triangle represents a pixel that is placed on the x and y axis based on the variables it is comparing.
# the correlation is how tight the correspondance of the scatter plot is
# the numbers are smaller where there is less correlation
# the highest vif is the highest multicorrelation
# calculate variance inflation factor per predictor, a metric of multicollinearity between variables
vif(env_stack)
## Variables VIF
## 1 WC_bio1 1.494759
## 2 WC_bio2 2.330385
## 3 WC_bio4 2.499115
## 4 WC_bio12 2.314188
## 5 ER_tri 3.234874
## 6 ER_topoWet 3.748889
# stepwise reduce predictors, based on a max correlation of 0.7 (max 1)
v <- vifcor(env_stack, th=0.7)
v
## 1 variables from the 6 input variables have collinearity problem:
##
## ER_topoWet
##
## After excluding the collinear variables, the linear correlation coefficients ranges between:
## min correlation ( ER_tri ~ WC_bio12 ): 0.07962199
## max correlation ( WC_bio12 ~ WC_bio2 ): -0.6979262
##
## ---------- VIFs of the remained variables --------
## Variables VIF
## 1 WC_bio1 1.388594
## 2 WC_bio2 2.329276
## 3 WC_bio4 2.437081
## 4 WC_bio12 2.334120
## 5 ER_tri 1.290442
# gives you a reduced set of predictors
# reduce enviromental raster stack by
env_stack_v <- usdm::exclude(env_stack, v)
# show pairs plot after multicollinearity reduction with vifcor()
pairs(env_stack_v)
# using the v above, none of them should have a correlation coefficient grater than 0.7 because of the threshold.
# fit a maximum entropy model
mdl_maxv <- maxent(env_stack_v, sf::as_Spatial(pts_train))
## This is MaxEnt version 3.4.3
readr::write_rds(mdl_maxv, mdl_maxv_rds)
mdl_maxv <- read_rds(mdl_maxv_rds)
# plot variable contributions per predictor
plot(mdl_maxv)
# plot term plots
response(mdl_maxv)
Which variables were removed due to multicollinearity and what is the rank of most to least important remaining variables in your model? WC_bio4 and ER_topowet were removed due to multicollinearity. The importance ranking is WC_bio1, WC_bio2, WC_bio12 and ER_tri with bio1 being the most important.
# predict
y_maxv <- predict(env_stack, mdl_maxv) #, ext=ext, progress='')
plot(y_maxv, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')
pts_test_p <- pts_test %>%
filter(present == 1) %>%
as_Spatial()
pts_test_a <- pts_test %>%
filter(present == 0) %>%
as_Spatial()
y_maxv <- predict(mdl_maxv, env_stack)
#plot(y_maxv)
# this is a prediction and youre comparing the prediction to our known presence and absence points.
e <- dismo::evaluate(
p = pts_test_p,
a = pts_test_a,
model = mdl_maxv,
x = env_stack)
e
## class : ModelEvaluation
## n presences : 1933
## n absences : 1996
## AUC : 0.8805386
## cor : 0.6780314
## max TPR+TNR at : 0.6543424
plot(e, 'ROC')
thr <- threshold(e)[['spec_sens']]
thr
## [1] 0.6543424
# of the presences observed, how many true predictions are we getting.
p_true <- na.omit(raster::extract(y_maxv, pts_test_p) >= thr)
# of the absences observed, how many true predictions are we getting
# anything less than the threshold will be a 0
a_true <- na.omit(raster::extract(y_maxv, pts_test_a) < thr)
# TPR & TNR is true positive and negative rates
# (t)rue/(f)alse (p)ositive/(n)egative rates
# p_true is a vector of trues and falses.
# this gives us a rate
tpr <- sum(p_true)/length(p_true)
# how many we observed were present but we predicted absence
fnr <- sum(!p_true)/length(p_true)
fpr <- sum(!a_true)/length(a_true)
tnr <- sum(a_true)/length(a_true)
#the terms above populate the matrix below
matrix(
c(tpr, fnr,
fpr, tnr),
nrow=2, dimnames = list(
c("present_obs", "absent_obs"),
c("present_pred", "absent_pred")))
## present_pred absent_pred
## present_obs 0.7982411 0.1172345
## absent_obs 0.2017589 0.8827655
# add point to ROC plot
points(fpr, tpr, pch=23, bg="blue")
# any of the points along the curve represent a different value for 0 and 1 which is different than the axis
# Ymax v is the predicted maxent from the environmental stack
# applying the threshold to give you the distribution of 1s in green and 0s in grey for habitat no habitat.
plot(y_maxv > thr)